Data Loading and Initial Exploration

library(tidyr)
library(lmtest)
library(MASS)
library(car)
library(dplyr)
library(ggplot2)
library(glmnet)
library(gridExtra)

cancer <- read.csv("/Users/kylie/Documents/Math372/cancer_reg.csv")
death.rate <- cancer$target_deathrate
hist(death.rate, main = "Death Rate Distribution", 
     xlab = "Death Rate", col = "lightblue", border = "white")
Distribution of Cancer Death Rates

Distribution of Cancer Death Rates

Data Preprocessing

We removed redundant and highly correlated features, then separated geographic information into county and state variables.

# Remove unwanted features
features_to_remove <- c("pctsomecol18_24", "pctprivatecoveragealone",
                        "pctpubliccoveragealone", "pctemployed16_over", "binnedinc",
                        "avganncount", "avgdeathsperyear", "popest2015")
cancer_clean <- cancer[, !(names(cancer) %in% features_to_remove)]

# Separate geography
cancer_clean <- cancer_clean %>%
  separate(geography, into = c("county", "state"), sep = ", ", remove = FALSE)

features_to_remove <- c("geography", "county")

# Convert state to factor
cancer_clean$state <- as.factor(cancer_clean$state)
cancer$state <- cancer_clean$state
cancer_clean <- cancer_clean[, !(names(cancer_clean) %in% features_to_remove)]

num_states <- length(levels(cancer_clean$state))

The dataset contains 51 states and 3047 counties.

Variable Selection with LASSO

cancer.rate <- c("incidencerate")
cancer_clean_death_pred <- cancer_clean[, !(names(cancer_clean) %in% cancer.rate)]
death_pred_matrix <- model.matrix(target_deathrate ~ ., data = cancer_clean_death_pred)
set.seed(1)
grid.lambda <- 10^seq(10, -2, length = 100)
x <- death_pred_matrix
y <- cancer_clean$target_deathrate
train <- sample(1:nrow(x), round(0.9 * nrow(x)))
test <- (-train)

x.train <- x[train, ]
x.test  <- x[test, ]
y.train <- y[train]
y.test  <- y[test]

cv.lasso <- cv.glmnet(x.train, y.train, alpha = 1)
best.lambda.lasso <- cv.lasso$lambda.min
lasso.pred <- predict(cv.lasso, s = best.lambda.lasso, newx = x.test)
mspe.lasso <- mean((lasso.pred - y.test)^2)

plot(cv.lasso)
LASSO Cross-Validation Results

LASSO Cross-Validation Results

The LASSO model achieved an MSPE of 511.9.

# Get LASSO coefficients at best lambda
lasso_coefs <- coef(cv.lasso, s = best.lambda.lasso)
selected_var_names <- rownames(lasso_coefs)[which(lasso_coefs != 0)]
selected_var_names <- selected_var_names[selected_var_names != "(Intercept)"]

# Create data frame with selected variables
x_no_intercept <- x[, -1]
available_vars <- colnames(x_no_intercept)
valid_selected_vars <- selected_var_names[selected_var_names %in% available_vars]

selected_data <- data.frame(
  target_deathrate = y,
  x_no_intercept[, valid_selected_vars]
)

# Fit OLS on selected variables
model <- lm(target_deathrate ~ ., data = selected_data)

Model Diagnostics

par(mfrow = c(2, 2))
plot(model)
Initial Model Diagnostics

Initial Model Diagnostics

par(mfrow = c(1, 1))
residuals <- model$residuals
sd_residuals <- sd(model$residuals)
mean_residuals <- mean(model$residuals)
stand_residuals <- (residuals - mean_residuals) / sd_residuals
shapiro_result <- shapiro.test(model$residuals)
bp_results <- bptest(model)

The Shapiro-Wilk normality test rejected the null hypothesis therefore our normality assumptions have not been met. Similarly, Breusch-Pagan constant variance test also rejected the null hypothesis therefore our variance was found to be heteroscedastic. We found heavy tails on the QQ plot therefore decided to subset our data into three subsets (left, center, right).

Data Subsetting and Transformation

model_center_subset <- lm(target_deathrate ~ ., 
                          data = selected_data[abs(stand_residuals) < 2, ])

Our center subset was still non normal and had a non constant variance so we decided to apply a Box-Cox transformation.

Box-Cox Transformation

bc_result <- boxcox(model_center_subset, plotit = TRUE)
Box-Cox Transformation Analysis

Box-Cox Transformation Analysis

optimal_lambda <- bc_result$x[which.max(bc_result$y)]
max_loglik <- max(bc_result$y)
ci_threshold <- max_loglik - qchisq(0.95, 1)/2
ci_indices <- which(bc_result$y >= ci_threshold)
lambda_ci <- range(bc_result$x[ci_indices])

Our optimal lambda is 0.707 with a 95% CI for lambda: [0.545, 0.869]. Therefore, we will apply a square root transformation. We will also apply square root transformations to our left and right subset, but we will focus our analysis on our center subset because it has the bulk of our data, >2800 observations.

Transformed Models

# MAIN MODEL --> where most of our data is >2800 data points focus analysis on this model
model_center_subset_sqrt <- lm(sqrt(target_deathrate) ~ ., 
                               data = selected_data[abs(stand_residuals) < 2, ])

# Left subset
model_left_subset <- lm(sqrt(target_deathrate) ~ ., 
                        data = selected_data[stand_residuals < -2, ])

# Right subset
model_right_subset <- lm(sqrt(target_deathrate) ~ ., 
                         data = selected_data[stand_residuals > 2, ])
par(mfrow = c(2, 2))
plot(model_center_subset_sqrt)
Transformed Center Model Diagnostics

Transformed Center Model Diagnostics

par(mfrow = c(1, 1))

Research Question 1: Insurance Coverage Effects

Research Question 2: Education Effects

Research Question 3: Geographic Effects